Coalescence experiment - Microbiome - 16S

I used the Kraken2 results with MPA-style based on silva 16S dataset 138.1. Here, I only keep bacteria at family level & CLR transformed (Kraken2 Barbell).

Initiate libraries and setup working env

rm(list=ls())

# library(ape)
library(tidyverse)
library(vegan)
library(ggpubr) 
library(phyloseq) 
library(circlize)
library(ggpmisc)  # For regression equation annotation
library(broom)  # For extracting p-values
library(RColorBrewer)


mytheme <- theme_light()+
  theme(text = element_text(size = 10),
        legend.title = element_text(face = "bold", size = 10),
        legend.text = element_text(size = 10), 
        legend.justification = c("right", "top"),
        legend.background = element_rect(),
        strip.background = element_blank(),
        strip.text = element_text(face = "bold", size = 10, color = "black"),
        axis.ticks = element_line(colour = "grey70", linewidth = 0.2),
        panel.grid.major = element_line(colour = "grey70", linewidth = 0.1),
        panel.grid.minor = element_blank())  

# Path to the folder containing Kraken2 report files
folder_path <- "div_analysis_family_level/"

# Create the directory if it does not already exist
if (!dir.exists(folder_path)) {
  dir.create(folder_path)
}

1. alpha-diversity for microcosm samples - sea & fresh water

Use relative abundance table for the analysis.

# read phyloseq object from all the samples - raw reads
ps_rawreads <- readRDS("phyloseq_object_family_rawreads.rds")

otu_table_all <- as.data.frame(otu_table(ps_rawreads)) 

otu_table <- otu_table_all %>% 
  select(!grep("Mock", names(otu_table_all)) & !grep("control", names(otu_table_all)) & !grep("FieldBlank", names(otu_table_all)) & !grep("f0_b0", names(otu_table_all))) %>% 
  select(!c(IGAPark, MittelmoleWarnemunde, NeptunWerft, SABMarinaBramow, Stadthafen)) %>% 
  rename(f100_b0_f_p0_r1 = Muhlendamm ,
         f0_b100_b_p0_r1 = WarnemundeStrand) 

otu_table[1:2, 1:2]
##                                                                                                        f0_b100_b_p1_r1
## d__Bacteria|p__Abditibacteriota|c__Abditibacteria|o__Abditibacteriales|f__Abditibacteriaceae                         0
## d__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Acidobacteriales|f__Acidobacteriaceae (Subgroup 1)               0
##                                                                                                        f0_b100_b_p1_r2
## d__Bacteria|p__Abditibacteriota|c__Abditibacteria|o__Abditibacteriales|f__Abditibacteriaceae                         0
## d__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Acidobacteriales|f__Acidobacteriaceae (Subgroup 1)               0
# number of families x number of samples
dim(otu_table)
## [1] 357 242
# summary of 240 samples (240 microcosm samples and two field samples)
summary(colSums(otu_table))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3591   11962   19580   22195   30152   83009
# total number of reads from 240 samples
sum(colSums(otu_table))
## [1] 5371097
otu_table_rel <- otu_table %>% 
  mutate(across(everything(), ~ .x / sum(.x))) 
otu_table_rel <- t(otu_table_rel)

otu_table_rel[120:121, c(16, 22)]
##                 d__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Frankiales|f__Sporichthyaceae
## f25_b75_b_p6_r4                                                                       7.753140e-05
## f25_b75_f_p1_r1                                                                       2.321371e-05
##                 d__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Micrococcales|f__Microbacteriaceae
## f25_b75_b_p6_r4                                                                            0.000000e+00
## f25_b75_f_p1_r1                                                                            2.321371e-05
dim(otu_table_rel)
## [1] 242 357
# get diversity indexes
Shannon <- diversity(otu_table_rel, "shannon")
head(Shannon)
## f0_b100_b_p1_r1 f0_b100_b_p1_r2 f0_b100_b_p1_r3 f0_b100_b_p1_r4 f0_b100_b_p2_r1 
##        1.779974        1.722050        1.514423        1.708745        1.465360 
## f0_b100_b_p2_r2 
##        1.307836
# Simpson <- diversity(t(otu_table_rel), "simpson")
# dim(Simpson)
Richness <- specnumber(otu_table_rel)
head(Richness)
## f0_b100_b_p1_r1 f0_b100_b_p1_r2 f0_b100_b_p1_r3 f0_b100_b_p1_r4 f0_b100_b_p2_r1 
##             104              96              98              96              75 
## f0_b100_b_p2_r2 
##              70
# Pielou's J = H'/ln(S) where H' is Shannon Weiner diversity and S is the total number of species in a sample,
Pielous_evenness <- Shannon/log(Richness)

df <- as.data.frame(cbind(Richness, Shannon)) %>% 
  rownames_to_column() %>%  
  separate(rowname, c("fresh_bac", "sea_bac", "watertype", "passage", "replicate"), "_") %>%  
  select(-replicate) %>% 
  mutate(watertype = factor(watertype, levels = c("f", "b"), labels = c("Freshwater", "Seawater"))) %>% 
  unite(mix, c(fresh_bac, sea_bac), sep = "_", remove = TRUE) %>% 
  mutate(mix = factor(mix, levels = c("f0_b100", "f25_b75", "f50_b50", "f75_b25", "f100_b0"),  labels = c("0:100", "25:75", "50:50", "75:25", "100:0"))) %>% 
  mutate(passage = factor(as.numeric(str_remove(passage, "^p"))))

head(df)
##     mix watertype passage Richness  Shannon
## 1 0:100  Seawater       1      104 1.779974
## 2 0:100  Seawater       1       96 1.722050
## 3 0:100  Seawater       1       98 1.514423
## 4 0:100  Seawater       1       96 1.708745
## 5 0:100  Seawater       2       75 1.465360
## 6 0:100  Seawater       2       70 1.307836
# write.csv(df, paste0(folder_path, "richness_shannon_all_samples.csv"))

df1 <- df %>% 
  pivot_longer(-c(mix, watertype, passage), names_to = "div_index") %>% 
  group_by(mix, watertype, passage, div_index) %>%
  summarize(mean = mean(value, na.rm=TRUE), sd = round(sd(value), 3), n = round(length(value), 3)) 
## `summarise()` has grouped output by 'mix', 'watertype', 'passage'. You can
## override using the `.groups` argument.
df1$se <- df1$sd / sqrt(df1$n)  # Calculate standard error of the mean

head(df1)
## # A tibble: 6 × 8
## # Groups:   mix, watertype, passage [3]
##   mix   watertype  passage div_index  mean     sd     n     se
##   <fct> <fct>      <fct>   <chr>     <dbl>  <dbl> <dbl>  <dbl>
## 1 0:100 Freshwater 1       Richness  55.2   9.95      4 4.97  
## 2 0:100 Freshwater 1       Shannon    1.00  0.292     4 0.146 
## 3 0:100 Freshwater 2       Richness  64.8  17.7       4 8.84  
## 4 0:100 Freshwater 2       Shannon    1.24  0.214     4 0.107 
## 5 0:100 Freshwater 3       Richness  45    15.3       4 7.65  
## 6 0:100 Freshwater 3       Shannon    1.46  0.143     4 0.0715
(p <- ggplot(df1, aes(passage, mean, group =mix)) +
    geom_errorbar(aes(ymin = mean-se, ymax = mean+se, color = mix), 
                  width = 0.2, position = position_dodge(width = 0.4), alpha = 0.6) +
    geom_line(aes(color = mix), position = position_dodge(width = 0.4), alpha = 0.6) +
    geom_point(aes(fill = mix), size = 2, stroke = 0.1, position = position_dodge(width = 0.4), shape = 21, color = "black") +
    labs(x = "Passages", y = "", title = "") +
    facet_grid(div_index ~ watertype, scales = "free_y") +
    scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio \n(fresh:sea)")+
    scale_color_brewer(palette = "RdYlBu", name = "Mixing ratio \n(fresh:sea)")+
    mytheme)

# ggsave(paste0(folder_path, "richness_shannon_lineplot_family.png"), width = 5.5, height = 3.5, p)

2. beta-diversity for only microcosm samples (no passage 0 / field samples)

Here we did PCA analysis based on clr transformed table. > “Aitchison distance has numerous other desirable properties, such as scale invariance, negating the need to perform rarefaction. This feature is critical when one lacks access to absolute microbial abundance, because scale variant distances ensure equivalence between distances computed from absolute and relative abundance measurements.”

# read phyloseq object from all the samples - CLR transformed
ps_clr <- readRDS("phyloseq_object_family_clr.rds")
ps_clr
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 357 taxa and 272 samples ]
## sample_data() Sample Data:       [ 272 samples by 8 sample variables ]
## tax_table()   Taxonomy Table:    [ 357 taxa by 5 taxonomic ranks ]
# only keep the samples from microcosm experiment
ps_microcosm <- subset_samples(ps_clr, catagery == "microcosm")
# remove taxa do not appear in any samples
ps_microcosm <- filter_taxa(ps_microcosm, function(x) sum(x) != 0, TRUE)

# check
ps_microcosm
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 343 taxa and 240 samples ]
## sample_data() Sample Data:       [ 240 samples by 8 sample variables ]
## tax_table()   Taxonomy Table:    [ 343 taxa by 5 taxonomic ranks ]
tail(sample_data(ps_microcosm))
##                 SampleNO  catagery sub_catagery freshwaterCom brackishwaterCom
## f50_b50_f_p4_r4      166 microcosm    microcosm           f50              b50
## f75_b25_f_p4_r3      169 microcosm    microcosm           f75              b25
## f0_b100_f_p3_r2      114 microcosm    microcosm            f0             b100
## f0_b100_f_p3_r4      116 microcosm    microcosm            f0             b100
## f75_b25_b_p1_r4       24 microcosm    microcosm           f75              b25
## f0_b100_f_p4_r4      158 microcosm    microcosm            f0             b100
##                 watertype passage replicate
## f50_b50_f_p4_r4         f      p4        r4
## f75_b25_f_p4_r3         f      p4        r3
## f0_b100_f_p3_r2         f      p3        r2
## f0_b100_f_p3_r4         f      p3        r4
## f75_b25_b_p1_r4         b      p1        r4
## f0_b100_f_p4_r4         f      p4        r4
# prepare feature table for further analysis
com <- as.data.frame(otu_table(t(ps_microcosm)))

# make col.names shorter
colnames(com) <- sub(".*(f__)", "\\", colnames(com))
com[1:3, 1:3]
##                 Abditibacteriaceae Acidobacteriaceae (Subgroup 1) uncultured
## f25_b75_f_p2_r1         -1.9941169                              0          0
## f25_b75_f_p3_r1          0.3209210                              0          0
## f25_b75_f_p4_r1         -0.5822332                              0          0
###########################
# PCA
###########################

# Aitchison distance is simply the Euclidean distance between samples after clr transformation,
pca_result <- rda(com)

sum_pca <- summary(pca_result)
sum_pca$cont$importance[2, 1]
## [1] 0.2304891
sum_pca$cont$importance[2, 2]
## [1] 0.09965272
biplot(pca_result, scaling = 2)  # Scaling options: 1 (focus on variables), 2 (focus on sites)

# Extract species scores (variable contributions) from PCA
species_scores <- as.data.frame(scores(pca_result, display = "species"))

# Set the threshold for a feature/family thet longer than certain values at PC1 or PC2
threshold <- 1

# Filter species based on PC1 or PC2 exceeding the threshold
filtered_species <- species_scores[abs(species_scores$PC1) > threshold | abs(species_scores$PC2) > threshold, ]
head(filtered_species)
##                            PC1           PC2
## Cyclobacteriaceae   -1.3203459  0.0008841836
## Spirosomaceae        1.7101496  0.9450120874
## Sphingobacteriaceae  1.3160709  0.3364114546
## Bacillaceae          0.2737779 -1.1774890817
## Planococcaceae       0.3809445 -1.2688189384
## Aeromonadaceae      -0.6909358  2.0807396685
row.names(filtered_species)
##  [1] "Cyclobacteriaceae"      "Spirosomaceae"          "Sphingobacteriaceae"   
##  [4] "Bacillaceae"            "Planococcaceae"         "Aeromonadaceae"        
##  [7] "Alteromonadaceae"       "Colwelliaceae"          "Pseudoalteromonadaceae"
## [10] "Shewanellaceae"         "Comamonadaceae"         "Oxalobacteraceae"      
## [13] "Enterobacteriaceae"     "Yersiniaceae"           "Marinomonadaceae"      
## [16] "Vibrionaceae"
# Site scores (coordinates of samples in PCA space)
df <- as.data.frame(scores(pca_result, display = "sites"))

# Add metadata
df <- df %>% 
  rownames_to_column() %>%  
  separate(rowname, c("fresh_bac", "sea_bac", "watertype", "passage", "replicate"), "_", remove = FALSE) %>%  
  # select(-replicate) %>% 
  mutate(watertype = factor(watertype, levels = c("f", "b"), labels = c("Freshwater", "Seawater"))) %>% 
  unite(mix, c(fresh_bac, sea_bac), sep = "_", remove = TRUE) %>% 
  mutate(mix = factor(mix, levels = c("f0_b100", "f25_b75", "f50_b50", "f75_b25", "f100_b0"),  labels = c("0:100", "25:75", "50:50", "75:25", "100:0"))) %>% 
  mutate(passage = factor(as.numeric(str_remove(passage, "^p"))))
# mutate(passage = factor(passage, levels = c("p0", "p1", "p2", "p3", "p4", "p5", "p6"))) 

head(df)
##           rowname   mix  watertype passage replicate       PC1         PC2
## 1 f25_b75_f_p2_r1 25:75 Freshwater       2        r1 0.4462170  1.91802547
## 2 f25_b75_f_p3_r1 25:75 Freshwater       3        r1 1.1792167  0.65538701
## 3 f25_b75_f_p4_r1 25:75 Freshwater       4        r1 0.9235986 -0.08924358
## 4 f25_b75_f_p5_r1 25:75 Freshwater       5        r1 0.6120449  0.96547827
## 5 f25_b75_f_p6_r1 25:75 Freshwater       6        r1 0.9405888  0.08467159
## 6 f50_b50_f_p6_r1 50:50 Freshwater       6        r1 0.7155475  0.86077381
# only color by water
(p1 <- ggplot(df, aes(PC1, PC2)) +
    geom_point(aes(fill = watertype, shape = watertype), size = 2.5, stroke = 0.1) + 
    labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""), 
         y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""), 
         title = "") +
    scale_fill_manual(values = c("#4575B4", "#D73027"), guide=guide_legend(override.aes = list(shape=21)), name = "Culture water") +
    scale_shape_manual(values = c(22, 21), name = "Culture water") +
    mytheme)

# facet by mixing ratios
(p2 <- ggplot(df, aes(PC1, PC2, shape = watertype, fill = mix)) +
    geom_point(size = 2.5, alpha = 0.6, stroke = 0.1) + 
    facet_grid(watertype ~ passage) +
    labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""), 
         y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""), 
         title = "") +
    scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio\n(fresh:sea)")+
    scale_shape_manual(values = c(22, 21), name = "Culture water") +
    mytheme)

# facet by passage
(p3 <- ggplot(df, aes(PC1, PC2, shape = watertype, fill = passage)) +
    geom_point(size = 2.5, alpha = 0.6, stroke = 0.1) + 
    facet_grid(watertype ~ mix) +
    labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""), 
         y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""), 
         title = "") +
    scale_fill_brewer(palette = "YlGn", guide=guide_legend(override.aes = list(shape=21)), name = "Passages")+
    scale_shape_manual(values = c(22, 21), guide = "none") +
    mytheme)

# (p23 <- ggarrange(p2, p3, labels = c("A", "B"), font.label = list(size = 10), widths = c(1, 0.7), nrow = 2, ncol = 1))

# ggsave(paste0(folder_path, "PCA_all_combined_facet_clr_no_p0_legend.pdf"), width = 8, height = 6, p23)


# PCA plot with family names when their length at PC1 or PC2 > threshold

(p4 <- ggplot(df, aes(PC1, PC2)) +
    geom_point(aes(fill = mix, shape = watertype), size = 3.5, alpha = 0.6, stroke = 0.1) + 
    geom_text(aes(label = passage), size = 1.2, alpha = 0.4, color = "black") +
    geom_segment(data = filtered_species, aes(x = 0, y = 0, xend = PC1, yend = PC2), arrow = arrow(length = unit(0.2, "cm")), linewidth = 0.1, color = "black", alpha = 0.6) +
    geom_text(data = filtered_species, aes(x = PC1*0.95, y = PC2*0.95, label = rownames(filtered_species)), color = "black", alpha = 0.8, size = 3, vjust = -0.5) +
    labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""), y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""), title = "") +
    scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio\n(fresh:sea)")+
    scale_shape_manual(values = c(22, 21), name = "Culture water") +
    mytheme)

# ggsave(paste0(folder_path,"PCA_water_family_kraken_barbell_clr_water_mix_no_p0_raw.pdf"), width = 5, height = 3.5, p4)

# make plot without arrows
summary(df)
##    rowname             mix          watertype   passage  replicate        
##  Length:240         0:100:48   Freshwater:120   1:40    Length:240        
##  Class :character   25:75:48   Seawater  :120   2:40    Class :character  
##  Mode  :character   50:50:48                    3:40    Mode  :character  
##                     75:25:48                    4:40                      
##                     100:0:48                    5:40                      
##                                                 6:40                      
##       PC1               PC2          
##  Min.   :-1.4623   Min.   :-2.29297  
##  1st Qu.:-0.9055   1st Qu.:-0.68410  
##  Median : 0.2004   Median : 0.09374  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.8873   3rd Qu.: 0.61582  
##  Max.   : 1.4795   Max.   : 1.91803
# make the bundary
min_x = min(df$PC1) + 0.2
max_x = max(df$PC1) + 0.2
min_y = min(df$PC2) + 0.2
max_y = max(df$PC2) + 0.2

colors <- brewer.pal(n = 11, name = "RdYlBu")
print(colors)
##  [1] "#A50026" "#D73027" "#F46D43" "#FDAE61" "#FEE090" "#FFFFBF" "#E0F3F8"
##  [8] "#ABD9E9" "#74ADD1" "#4575B4" "#313695"
(p5 <- ggplot(df, aes(PC1, PC2)) +
    geom_point(aes(fill = mix, shape = watertype), size = 3, alpha = 0.6, stroke = 0.1) + 
    geom_text(aes(label = passage), size = 2, alpha = 0.4, color = "black") +
    labs(x = paste("PCA1 (", round(sum_pca$cont$importance[2, 1] * 100, 2), "%)", sep = ""), 
         y = paste("PCA2 (", round(sum_pca$cont$importance[2, 2] * 100, 2), "%)", sep = ""), 
         title = "") +
    # scale_x_continuous(limits = c(min_x, max_x)) +
    # scale_y_continuous(limits = c(min_y, max_y)) +
    scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio\n(fresh:sea)") +
    scale_shape_manual(values = c(22, 21), name = "Culture water") +
    mytheme)

# ggsave(paste0(folder_path, "PCA_water_family_kraken_barbell_clr_water_mix_no_p0_all.png"), width = 4.5, height = 3, p5)



###########################
# permanova
###########################

# Permutational Multivariate Analysis of Variance Using Distance Matrices

dist <- vegdist(com, method = "euclidean") 

df <- as.data.frame(com[,1:2]) %>% 
  rownames_to_column() %>%  
  select(rowname) %>% 
  separate(rowname, c("fresh_bac", "sea_bac", "watertype", "passage", "replicate"), "_", remove = FALSE) %>%  
  column_to_rownames(var = "rowname") %>% 
  mutate(watertype = factor(watertype, levels = c("f", "b"), labels = c("freshwater", "seawater"))) %>% 
  unite(mix, c(fresh_bac, sea_bac), sep = "_", remove = TRUE) %>% 
  mutate(mix = factor(mix, levels = c("f0_b100", "f25_b75", "f50_b50", "f75_b25", "f100_b0"))) %>% 
  mutate(passage = factor(passage, levels = c("p0", "p1", "p2", "p3", "p4", "p5", "p6")))

head(df)
##                     mix  watertype passage replicate
## f25_b75_f_p2_r1 f25_b75 freshwater      p2        r1
## f25_b75_f_p3_r1 f25_b75 freshwater      p3        r1
## f25_b75_f_p4_r1 f25_b75 freshwater      p4        r1
## f25_b75_f_p5_r1 f25_b75 freshwater      p5        r1
## f25_b75_f_p6_r1 f25_b75 freshwater      p6        r1
## f50_b50_f_p6_r1 f50_b50 freshwater      p6        r1
all(rownames(as.matrix(dist)) == rownames(df))
## [1] TRUE
# # three-way permanova
set.seed(123)
# sequential tests
result_whole <- adonis2(dist ~ mix*watertype*passage, data = df, permutation = 999, by = "terms")
#result_whole <- adonis2(com ~ mix*watertype*passage, data = df, method = "euclidean", permutation = 999)
result_whole
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = dist ~ mix * watertype * passage, data = df, permutations = 999, by = "terms")
##                        Df SumOfSqs      R2       F Pr(>F)    
## mix                     4     3583 0.08261  7.9334  0.001 ***
## watertype               1     8633 0.19902 76.4549  0.001 ***
## passage                 5     2141 0.04936  3.7928  0.001 ***
## mix:watertype           4     2803 0.06462  6.2059  0.001 ***
## mix:passage            20     2094 0.04828  0.9274  0.792    
## watertype:passage       5     1465 0.03377  2.5944  0.001 ***
## mix:watertype:passage  20     2333 0.05378  1.0331  0.327    
## Residual              180    20325 0.46856                   
## Total                 239    43378 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# write.csv(result_whole, paste0(folder_path, "permanova_water_mix_passage_family_Aitchison.csv"))

2.2 Dissimilarity to source communities analysis

To assess the influence of each source community on the coalesced community.

# pairwise dissimilarity
dist <- vegdist(com, method = "euclidean", binary = FALSE, diag = 1) 

# Convert distance to dissimilarity in the range [0, 1]
# dissimilarity <- dist / max(dist)

# df <- as.matrix(dissimilarity)
df <- as.matrix(dist)
df <- data.frame(as.table(df))[lower.tri(df, diag = FALSE), ]
cat("For no. of df pairs: Observed = Predict", (240*240-240)/2 == length(df$Freq), "\n")
## For no. of df pairs: Observed = Predict TRUE
row.names(df) <- paste(df$Var1, df$Var2, sep = "_")
summary(df)
##               Var1                    Var2            Freq       
##  f0_b100_f_p4_r4:  239   f25_b75_f_p2_r1:  239   Min.   : 7.933  
##  f75_b25_b_p1_r4:  238   f25_b75_f_p3_r1:  238   1st Qu.:16.587  
##  f0_b100_f_p3_r4:  237   f25_b75_f_p4_r1:  237   Median :19.041  
##  f0_b100_f_p3_r2:  236   f25_b75_f_p5_r1:  236   Mean   :18.788  
##  f75_b25_f_p4_r3:  235   f25_b75_f_p6_r1:  235   3rd Qu.:21.061  
##  f50_b50_f_p4_r4:  234   f50_b50_f_p6_r1:  234   Max.   :28.842  
##  (Other)        :27261   (Other)        :27261
# define source communities, i.e. f100-f & b100-b 
df_source <- df[grepl("f100_b0_f", df$Var1) | grepl("f100_b0_f", df$Var2) | grepl("f0_b100_b", df$Var1) | grepl("f0_b100_b", df$Var2), ]

df_source <- df_source %>% 
  rownames_to_column(var = "name") %>% 
  separate(name, c("fresh_a", "sea_a", "water_a", "passage_a", "replicate_a", "fresh_b", "sea_b", "water_b", "passage_b", "replicate_b"), "_", remove = FALSE) %>%
  filter(passage_a == passage_b) %>% 
  select(-c(replicate_a, replicate_b, passage_b)) %>%
  rename(passage = passage_a) %>% 
  mutate(passage = factor(gsub("p", "", passage))) %>% 
  unite(a, c(fresh_a, sea_a), sep = "_", remove = TRUE) %>% 
  unite(b, c(fresh_b, sea_b), sep = "_", remove = TRUE) %>% 
  select(-c(Var1, Var2)) %>%
  column_to_rownames(var = "name")

dim(df_source)
## [1] 1704    6
tail(df_source)
##                                       a water_a passage       b water_b
## f25_b75_b_p5_r2_f0_b100_b_p5_r1 f25_b75       b       5 f0_b100       b
## f25_b75_b_p5_r4_f0_b100_b_p5_r1 f25_b75       b       5 f0_b100       b
## f50_b50_b_p5_r3_f0_b100_b_p5_r1 f50_b50       b       5 f0_b100       b
## f50_b50_f_p5_r2_f0_b100_b_p5_r1 f50_b50       f       5 f0_b100       b
## f50_b50_f_p5_r4_f0_b100_b_p5_r1 f50_b50       f       5 f0_b100       b
## f75_b25_b_p5_r3_f0_b100_b_p5_r1 f75_b25       b       5 f0_b100       b
##                                     Freq
## f25_b75_b_p5_r2_f0_b100_b_p5_r1 13.51981
## f25_b75_b_p5_r4_f0_b100_b_p5_r1 16.69061
## f50_b50_b_p5_r3_f0_b100_b_p5_r1 12.05826
## f50_b50_f_p5_r2_f0_b100_b_p5_r1 16.47511
## f50_b50_f_p5_r4_f0_b100_b_p5_r1 19.06562
## f75_b25_b_p5_r3_f0_b100_b_p5_r1 16.26554
# in freshwater media
f_levels <- c("f100_b0_f", "f75_b25_f", "f50_b50_f", "f25_b75_f", "f0_b100_f", "f0_b100_b")

df_f <- df_source %>% 
  unite(mix_a, c(a, water_a), sep = "_") %>% 
  unite(mix_b, c(b, water_b), sep = "_") %>% 
  mutate(mix_a = factor(mix_a)) %>% 
  mutate(mix_b = factor(mix_b)) %>% 
  filter(mix_a %in% f_levels & mix_b %in% f_levels) %>% 
  unite(mix, c(mix_a, mix_b), sep = "_") %>% 
  mutate(mix = factor(mix)) %>% 
  filter(mix != "f0_b100_b_f0_b100_b") %>% 
  droplevels()

df_f$watertype <- "Freshwater"

levels(df_f$mix)
##  [1] "f0_b100_b_f0_b100_f" "f0_b100_b_f100_b0_f" "f0_b100_b_f25_b75_f"
##  [4] "f0_b100_b_f50_b50_f" "f0_b100_b_f75_b25_f" "f0_b100_f_f0_b100_b"
##  [7] "f0_b100_f_f100_b0_f" "f100_b0_f_f0_b100_b" "f100_b0_f_f0_b100_f"
## [10] "f100_b0_f_f100_b0_f" "f100_b0_f_f25_b75_f" "f100_b0_f_f50_b50_f"
## [13] "f100_b0_f_f75_b25_f" "f25_b75_f_f0_b100_b" "f25_b75_f_f100_b0_f"
## [16] "f50_b50_f_f0_b100_b" "f50_b50_f_f100_b0_f" "f75_b25_f_f0_b100_b"
## [19] "f75_b25_f_f100_b0_f"
# Add source_com and mix_ratio columns
df_f <- df_f %>%
  mutate(
    source_com = ifelse(grepl("f0_b100_b", mix), "sea", "fresh"),
    mix_ratio = case_when(
      grepl("f0_b100_f", mix) ~ "0:100",
      grepl("f25_b75", mix) ~ "25:75",
      grepl("f50_b50", mix) ~ "50:50",
      grepl("f75_b25", mix) ~ "75:25",
      TRUE ~ "100:0"  # Default value
    )
  ) %>% 
  mutate(mix_simple = paste(mix_ratio, "vs.", source_com))

f_factor_levels <- c("100:0 vs. fresh", 
                     "75:25 vs. fresh", 
                     "50:50 vs. fresh", 
                     "25:75 vs. fresh",
                     "0:100 vs. fresh",
                     "100:0 vs. sea",  
                     "75:25 vs. sea", 
                     "50:50 vs. sea", 
                     "25:75 vs. sea", 
                     "0:100 vs. sea")

# I should have 4*4*6 pairs for each coalesced com to a source comunity, for the source community itsefl, which should have 6*6 pairs
df_f <- df_f %>% 
  mutate(mix_simple = factor(mix_simple, levels = f_factor_levels)) 

# check results
head(df_f)
##                                                 mix passage     Freq  watertype
## f0_b100_b_p2_r2_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f       2 21.07002 Freshwater
## f0_b100_b_p2_r3_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f       2 21.89174 Freshwater
## f100_b0_f_p2_r1_f25_b75_f_p2_r1 f100_b0_f_f25_b75_f       2 15.56770 Freshwater
## f0_b100_b_p2_r1_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f       2 21.62564 Freshwater
## f0_b100_b_p2_r4_f25_b75_f_p2_r1 f0_b100_b_f25_b75_f       2 21.88233 Freshwater
## f100_b0_f_p2_r4_f25_b75_f_p2_r1 f100_b0_f_f25_b75_f       2 14.00560 Freshwater
##                                 source_com mix_ratio      mix_simple
## f0_b100_b_p2_r2_f25_b75_f_p2_r1        sea     25:75   25:75 vs. sea
## f0_b100_b_p2_r3_f25_b75_f_p2_r1        sea     25:75   25:75 vs. sea
## f100_b0_f_p2_r1_f25_b75_f_p2_r1      fresh     25:75 25:75 vs. fresh
## f0_b100_b_p2_r1_f25_b75_f_p2_r1        sea     25:75   25:75 vs. sea
## f0_b100_b_p2_r4_f25_b75_f_p2_r1        sea     25:75   25:75 vs. sea
## f100_b0_f_p2_r4_f25_b75_f_p2_r1      fresh     25:75 25:75 vs. fresh
dim(df_f) # number of rows should be 900 (96*9 + 36)
## [1] 900   7
df_f %>% filter(mix_ratio == "25:75" & source_com == "sea" & passage == "1") 
##                                                 mix passage     Freq  watertype
## f25_b75_f_p1_r1_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b       1 24.11564 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b       1 22.36943 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b       1 23.43745 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r1 f25_b75_f_f0_b100_b       1 22.23588 Freshwater
## f25_b75_f_p1_r1_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b       1 22.04776 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b       1 21.04894 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b       1 21.68552 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r2 f25_b75_f_f0_b100_b       1 20.41457 Freshwater
## f25_b75_f_p1_r1_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b       1 23.42012 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b       1 22.00670 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b       1 23.00056 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r3 f25_b75_f_f0_b100_b       1 21.97839 Freshwater
## f25_b75_f_p1_r1_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b       1 23.67377 Freshwater
## f25_b75_f_p1_r3_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b       1 21.85926 Freshwater
## f25_b75_f_p1_r2_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b       1 22.56887 Freshwater
## f25_b75_f_p1_r4_f0_b100_b_p1_r4 f25_b75_f_f0_b100_b       1 21.49220 Freshwater
##                                 source_com mix_ratio    mix_simple
## f25_b75_f_p1_r1_f0_b100_b_p1_r1        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r1        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r1        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r1        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r1_f0_b100_b_p1_r2        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r2        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r2        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r2        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r1_f0_b100_b_p1_r3        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r3        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r3        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r3        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r1_f0_b100_b_p1_r4        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r3_f0_b100_b_p1_r4        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r2_f0_b100_b_p1_r4        sea     25:75 25:75 vs. sea
## f25_b75_f_p1_r4_f0_b100_b_p1_r4        sea     25:75 25:75 vs. sea
# in seawater media
s_levels <- c("f0_b100_b", "f25_b75_b", "f50_b50_b", "f75_b25_b", "f100_b0_b", "f100_b0_f")

df_s <- df_source %>% 
  unite(mix_a, c(a, water_a), sep = "_") %>% 
  unite(mix_b, c(b, water_b), sep = "_") %>% 
  mutate(mix_a = factor(mix_a)) %>% 
  mutate(mix_b = factor(mix_b)) %>% 
  filter(mix_a %in% s_levels & mix_b %in% s_levels) %>% 
  unite(mix, c(mix_a, mix_b), sep = "_") %>% 
  mutate(mix = factor(mix)) %>% 
  filter(mix != "f100_b0_f_f100_b0_f") %>%
  droplevels()

df_s$watertype <- "Seawater"

levels(df_s$mix)
##  [1] "f0_b100_b_f0_b100_b" "f0_b100_b_f100_b0_b" "f0_b100_b_f100_b0_f"
##  [4] "f0_b100_b_f25_b75_b" "f0_b100_b_f50_b50_b" "f0_b100_b_f75_b25_b"
##  [7] "f100_b0_b_f0_b100_b" "f100_b0_b_f100_b0_f" "f100_b0_f_f0_b100_b"
## [10] "f100_b0_f_f100_b0_b" "f100_b0_f_f25_b75_b" "f100_b0_f_f50_b50_b"
## [13] "f100_b0_f_f75_b25_b" "f25_b75_b_f0_b100_b" "f25_b75_b_f100_b0_f"
## [16] "f50_b50_b_f0_b100_b" "f50_b50_b_f100_b0_f" "f75_b25_b_f0_b100_b"
## [19] "f75_b25_b_f100_b0_f"
# Add source_com and mix_ratio columns
df_s <- df_s %>%
  mutate(
    source_com = ifelse(grepl("f100_b0_f", mix), "fresh", "sea"),
    mix_ratio = case_when(
      grepl("f100_b0_b", mix) ~ "100:0",
      grepl("f25_b75", mix) ~ "25:75",
      grepl("f50_b50", mix) ~ "50:50",
      grepl("f75_b25", mix) ~ "75:25",
      TRUE ~ "0:100"  # Default value
    )
  ) %>% 
  mutate(mix_simple = paste(mix_ratio, "vs.", source_com))

b_factor_levels <- c("100:0 vs. fresh",
                     "75:25 vs. fresh",
                     "50:50 vs. fresh", 
                     "25:75 vs. fresh", 
                     "0:100 vs. fresh", 
                     "100:0 vs. sea",
                     "75:25 vs. sea",
                     "50:50 vs. sea",
                     "25:75 vs. sea", 
                     "0:100 vs. sea")

df_s <- df_s %>% 
  mutate(mix_simple = factor(mix_simple, levels = b_factor_levels))

# check results
dim(df_s) # number of rows should be 900 (96*9 + 36)
## [1] 900   7
df_s %>% filter(mix_ratio == "0:100" & source_com == "sea" & passage == "1") 
##                                                 mix passage     Freq watertype
## f0_b100_b_p1_r2_f0_b100_b_p1_r1 f0_b100_b_f0_b100_b       1 14.75785  Seawater
## f0_b100_b_p1_r3_f0_b100_b_p1_r1 f0_b100_b_f0_b100_b       1 12.62805  Seawater
## f0_b100_b_p1_r4_f0_b100_b_p1_r1 f0_b100_b_f0_b100_b       1 13.40322  Seawater
## f0_b100_b_p1_r3_f0_b100_b_p1_r2 f0_b100_b_f0_b100_b       1 12.89964  Seawater
## f0_b100_b_p1_r4_f0_b100_b_p1_r2 f0_b100_b_f0_b100_b       1 12.42332  Seawater
## f0_b100_b_p1_r4_f0_b100_b_p1_r3 f0_b100_b_f0_b100_b       1 13.14045  Seawater
##                                 source_com mix_ratio    mix_simple
## f0_b100_b_p1_r2_f0_b100_b_p1_r1        sea     0:100 0:100 vs. sea
## f0_b100_b_p1_r3_f0_b100_b_p1_r1        sea     0:100 0:100 vs. sea
## f0_b100_b_p1_r4_f0_b100_b_p1_r1        sea     0:100 0:100 vs. sea
## f0_b100_b_p1_r3_f0_b100_b_p1_r2        sea     0:100 0:100 vs. sea
## f0_b100_b_p1_r4_f0_b100_b_p1_r2        sea     0:100 0:100 vs. sea
## f0_b100_b_p1_r4_f0_b100_b_p1_r3        sea     0:100 0:100 vs. sea
# combine results from freshwater and seawater
df_fs <- rbind(df_f, df_s) %>% 
  select(mix, watertype, mix_simple, mix_ratio, source_com, passage, Freq)
str(df_fs)
## 'data.frame':    1800 obs. of  7 variables:
##  $ mix       : Factor w/ 36 levels "f0_b100_b_f0_b100_f",..: 3 3 11 3 3 11 11 11 11 11 ...
##  $ watertype : chr  "Freshwater" "Freshwater" "Freshwater" "Freshwater" ...
##  $ mix_simple: Factor w/ 10 levels "100:0 vs. fresh",..: 9 9 4 9 9 4 4 4 4 4 ...
##  $ mix_ratio : chr  "25:75" "25:75" "25:75" "25:75" ...
##  $ source_com: chr  "sea" "sea" "fresh" "sea" ...
##  $ passage   : Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ Freq      : num  21.1 21.9 15.6 21.6 21.9 ...
cat("I should have these much pairs", (9*16+6)*6*2)
## I should have these much pairs 1800
warning("Check if the row number is right or not?!")
## Warning: Check if the row number is right or not?!
###########################
# box plot 
###########################

# compare two source communities
(p <- ggplot(df_fs, aes(passage, Freq, fill = mix_simple)) +
   geom_boxplot(alpha = 0.8, position = position_dodge(0.8), outlier.size=-1, linewidth = 0.1) +
   geom_point(size = 1, stroke = 0.1, position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8), shape = 21, color = "black", alpha = 0.6) +
   facet_grid(watertype~.) +
   labs(x = "Passages", y = "Aitchison distance to the native/source communities", title = "") +
   scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio vs. native com.\n(fresh:sea)", direction = -1) + 
   mytheme) #+

# theme(panel.background = element_rect(fill = "lavenderblush1"),
#       strip.text = element_blank())) 

# ggsave(paste0(folder_path, "distance_to_source_communities_box_plot_raw.pdf"), width = 7, height = 3.5, p)


df_fs1 <- df_fs %>% 
  mutate(source_com = factor(source_com, levels = c("fresh", "sea"), labels = c("freshwater community", "seawater community"))) %>% 
  mutate(mix_ratio = factor(mix_ratio, levels = c("100:0", "75:25", "50:50", "25:75", "0:100" )))

# compare two culturing water conditions
(p <- ggplot(df_fs1, aes(interaction(watertype, passage), Freq, fill = mix_ratio)) +
    geom_boxplot(alpha = 0.8, position = position_dodge(0.8), outlier.size=-1, linewidth = 0.1) +
    geom_point(size = 1, stroke = 0.1, position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8), shape = 21, color = "black", alpha = 0.6) +
    facet_grid(source_com~.) +
    labs(x = "Culturing water x Passages", y = "Aitchison distance to the native communities", title = "") +
    scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio vs. native com.\n(fresh:sea)", direction = -1) + 
    mytheme ) # + theme(axis.text.x = element_text(angle = 45, hjust = 1)))

# ggsave(paste0(folder_path, "distance_to_source_communities_box_plot_source_com_raw.pdf"), width = 9, height = 3.8, p)


###########################
# line plot for all passages
###########################

df_fs1 <- df_fs %>% 
  separate(mix_ratio, c("fresh_ratio", "sea_ratio"), ":", remove = FALSE) %>% 
  select(-c(mix, mix_simple))

df_fs1 <- df_fs1 %>%
  mutate(source_ratio = if_else(source_com == "fresh", fresh_ratio, sea_ratio))

df_fs1$source_ratio <- as.numeric(as.character(df_fs1$source_ratio))
df_fs1$fresh_ratio <- as.numeric(as.character(df_fs1$fresh_ratio))

head(df_fs1)
##                                  watertype mix_ratio fresh_ratio sea_ratio
## f0_b100_b_p2_r2_f25_b75_f_p2_r1 Freshwater     25:75          25        75
## f0_b100_b_p2_r3_f25_b75_f_p2_r1 Freshwater     25:75          25        75
## f100_b0_f_p2_r1_f25_b75_f_p2_r1 Freshwater     25:75          25        75
## f0_b100_b_p2_r1_f25_b75_f_p2_r1 Freshwater     25:75          25        75
## f0_b100_b_p2_r4_f25_b75_f_p2_r1 Freshwater     25:75          25        75
## f100_b0_f_p2_r4_f25_b75_f_p2_r1 Freshwater     25:75          25        75
##                                 source_com passage     Freq source_ratio
## f0_b100_b_p2_r2_f25_b75_f_p2_r1        sea       2 21.07002           75
## f0_b100_b_p2_r3_f25_b75_f_p2_r1        sea       2 21.89174           75
## f100_b0_f_p2_r1_f25_b75_f_p2_r1      fresh       2 15.56770           25
## f0_b100_b_p2_r1_f25_b75_f_p2_r1        sea       2 21.62564           75
## f0_b100_b_p2_r4_f25_b75_f_p2_r1        sea       2 21.88233           75
## f100_b0_f_p2_r4_f25_b75_f_p2_r1      fresh       2 14.00560           25
# Step 1: Compute p-values for each group
lm_results <- df_fs1 %>%
  group_by(watertype, source_com) %>%
  do({
    model <- lm(Freq ~ source_ratio, data = .)
    tidy(model)  # Extract regression statistics
  }) %>%
  filter(term == "source_ratio") %>%  # Only keep slope term (not intercept)
  mutate(significant = p.value < 0.05)  # Mark significant regressions

# Step 2: Merge with original data to keep significant groups only
df_filtered <- df_fs1 %>%
  inner_join(lm_results, by = c("watertype", "source_com")) %>% 
  mutate(source_com = factor(source_com, levels = c("fresh", "sea"), labels = c("Freshwater\nSource Community", "Seawater\nSource Community")))

df_fs1 <- df_fs1 %>% 
  mutate(source_com = factor(source_com, levels = c("fresh", "sea"), labels = c("Freshwater\nSource Community", "Seawater\nSource Community")))
head(df_filtered)
##    watertype mix_ratio fresh_ratio sea_ratio                   source_com
## 1 Freshwater     25:75          25        75   Seawater\nSource Community
## 2 Freshwater     25:75          25        75   Seawater\nSource Community
## 3 Freshwater     25:75          25        75 Freshwater\nSource Community
## 4 Freshwater     25:75          25        75   Seawater\nSource Community
## 5 Freshwater     25:75          25        75   Seawater\nSource Community
## 6 Freshwater     25:75          25        75 Freshwater\nSource Community
##   passage     Freq source_ratio         term     estimate   std.error
## 1       2 21.07002           75 source_ratio -0.004836835 0.002329795
## 2       2 21.89174           75 source_ratio -0.004836835 0.002329795
## 3       2 15.56770           25 source_ratio -0.044714568 0.003839986
## 4       2 21.62564           75 source_ratio -0.004836835 0.002329795
## 5       2 21.88233           75 source_ratio -0.004836835 0.002329795
## 6       2 14.00560           25 source_ratio -0.044714568 0.003839986
##    statistic      p.value significant
## 1  -2.076078 3.842067e-02        TRUE
## 2  -2.076078 3.842067e-02        TRUE
## 3 -11.644462 2.468113e-27        TRUE
## 4  -2.076078 3.842067e-02        TRUE
## 5  -2.076078 3.842067e-02        TRUE
## 6 -11.644462 2.468113e-27        TRUE
str(df_fs1)
## 'data.frame':    1800 obs. of  8 variables:
##  $ watertype   : chr  "Freshwater" "Freshwater" "Freshwater" "Freshwater" ...
##  $ mix_ratio   : chr  "25:75" "25:75" "25:75" "25:75" ...
##  $ fresh_ratio : num  25 25 25 25 25 25 25 25 25 25 ...
##  $ sea_ratio   : chr  "75" "75" "75" "75" ...
##  $ source_com  : Factor w/ 2 levels "Freshwater\nSource Community",..: 2 2 1 2 2 1 1 1 1 1 ...
##  $ passage     : Factor w/ 6 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 3 3 ...
##  $ Freq        : num  21.1 21.9 15.6 21.6 21.9 ...
##  $ source_ratio: num  75 75 25 75 75 25 25 25 25 25 ...
# Step 3: line plot 

# line plot - combine all passages
(p <- ggplot(df_fs1, aes(fresh_ratio, Freq, fill = source_com, color = source_com)) +
    geom_point(size = 1.5, alpha = 0.05, position = position_jitter(width = 4)) +
    scale_x_continuous( breaks = c(0, 25, 50, 75, 100),  labels = c("0:100", "25:75", "50:50", "75:25", "100:0")) +
    facet_grid(. ~ watertype ) +
    geom_smooth(data = df_fs1, method = "lm", aes(color = source_com), size = 0.5, alpha = 0.3) +  
    # geom_smooth(method = "lm", aes(color = source_com), alpha = 0.6) +  # Match regression line colors
    # Plot regression lines only for significant groups
    # Add regression equation, R², and p-value
    stat_poly_eq(
      aes(label = after_stat(paste(..eq.label.., ..rr.label.., ..p.value.label.., sep = "~~~"))),
      formula = y ~ x, 
      parse = TRUE, 
      size = 2.5) +
    labs(x = "Mixing ratio\n(fresh:sea)", y = "Aitchison distance", title = "") +
    scale_fill_manual(values = c("#4575B4", "#D73027"), guide=guide_legend(override.aes = list(shape=21)), name = "Distance to") +
    scale_color_manual(values = c("#4575B4", "#D73027"), name = "Distance to") +
    mytheme +
    theme(legend.position.inside = c(0.95, 0.95)))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

# ggsave(paste0(folder_path, "distance_to_source_communities_line_plot_clr_all_raw.pdf"), width = 6, height = 3, p)



####################
# line plot in facet
####################

# Step 1: Compute p-values for each group
lm_results <- df_fs1 %>%
  group_by(watertype, passage, source_com) %>%
  do({
    model <- lm(Freq ~ source_ratio, data = .)
    tidy(model)  # Extract regression statistics
  }) %>%
  filter(term == "source_ratio") %>%  # Only keep slope term (not intercept)
  mutate(significant = p.value < 0.05)  # Mark significant regressions

# Step 2: Merge with original data to keep significant groups only
df_filtered <- df_fs1 %>%
  inner_join(lm_results %>% filter(significant), by = c("watertype", "passage", "source_com"))


# Step 3: line plot in facet
(p <- ggplot(df_fs1, aes(source_ratio, Freq, fill = source_com, color = source_com)) +
    geom_point(size = 2, alpha = 0.2) +
    # geom_smooth(method = "lm", aes(color = source_com), alpha = 0.6) +  # Match regression line colors
    # Plot regression lines only for significant groups
    geom_smooth(data = df_filtered, method = "lm", aes(color = source_com), alpha = 0.7) +  
    # Add regression equation, R², and p-value
    stat_poly_eq(
      aes(label = after_stat(
        paste(..eq.label.., ..rr.label.., ..p.value.label.., sep = "~~~")
      )),
      formula = y ~ x, 
      parse = TRUE, 
      size = 1.8) +
    facet_grid(watertype~passage) +
    scale_x_continuous(
      breaks = c(0, 25, 50, 75, 100),  # Define break points
      labels = c("0:100", "25:75", "50:50", "75:25", "100:0")) +
    labs(x = "", y = "Aitchison distance to the native/source communities", title = "") +
    scale_fill_manual(values = c("#4575B4", "#D73027"), guide=guide_legend(override.aes = list(shape=21)), name = "Native/source community") + 
    scale_color_manual(values = c("#4575B4", "#D73027"), name = "Native/source community") +
    mytheme +
    theme(legend.position = "bottom"))
## `geom_smooth()` using formula = 'y ~ x'
## Warning in ci_f_ncp(stat, df1 = df1, df2 = df2, probs = probs): Upper limit
## outside search range. Set to the maximum of the parameter range.
## Warning in compute_group(...): CI computation error: Error in check_output(cint, probs = probs, parameter_range = c(0, 1)): out[1] <= out[2] is not TRUE

# ggsave(paste0(folder_path, "distance_to_source_communities_line_plot_raw.png"), width = 10, height = 4, p)


###########################
# dis to source on two axis
###########################

# reorganize the data.frame
df_dis <- df_fs %>% 
  select(-c(mix, mix_simple)) %>% 
  group_by(watertype, mix_ratio, source_com, passage) %>% 
  mutate(seq_number = row_number()) %>%
  pivot_wider(names_from = c(source_com), values_from = Freq) %>% 
  mutate(mix_ratio = factor(mix_ratio, levels = c("0:100", "25:75", "50:50", "75:25", "100:0"))) %>% 
  drop_na()

# check 
summary(df_dis)
##   watertype         mix_ratio   passage   seq_number          sea        
##  Length:840         0:100:132   1:140   Min.   : 1.000   Min.   : 7.933  
##  Class :character   25:75:192   2:140   1st Qu.: 4.000   1st Qu.:15.492  
##  Mode  :character   50:50:192   3:140   Median : 8.000   Median :18.835  
##                     75:25:192   4:140   Mean   : 8.071   Mean   :18.104  
##                     100:0:132   5:140   3rd Qu.:12.000   3rd Qu.:20.751  
##                                 6:140   Max.   :16.000   Max.   :25.858  
##      fresh       
##  Min.   : 9.295  
##  1st Qu.:15.203  
##  Median :18.837  
##  Mean   :18.274  
##  3rd Qu.:20.951  
##  Max.   :28.249
df_dis %>% filter(watertype == "Freshwater" & mix_ratio == "100:0" & passage == "1") %>% print()
## # A tibble: 6 × 6
## # Groups:   watertype, mix_ratio, passage [1]
##   watertype  mix_ratio passage seq_number   sea fresh
##   <chr>      <fct>     <fct>        <int> <dbl> <dbl>
## 1 Freshwater 100:0     1                1  23.7 14.8 
## 2 Freshwater 100:0     1                2  21.5 15.4 
## 3 Freshwater 100:0     1                3  23.2 15.2 
## 4 Freshwater 100:0     1                4  22.9  9.95
## 5 Freshwater 100:0     1                5  22.1 11.2 
## 6 Freshwater 100:0     1                6  22.4 10.6
# write.csv(df_dis, paste0(folder_path, "distance_to_source_com_check.csv"))

# Compute regression statistics for each combination of watertype and passage
regression_results <- df_dis %>%
  group_by(watertype, passage) %>%
  summarize(
    model = list(lm(sea ~ fresh, data = cur_data())),
    p_value = summary(model[[1]])$coefficients[2, 4]  # Extract p-value for fresh
  ) %>%
  filter(p_value < 0.05)  # Keep only significant regressions
## Warning: There was 1 warning in `summarize()`.
## ℹ In argument: `model = list(lm(sea ~ fresh, data = cur_data()))`.
## ℹ In group 1: `watertype = "Freshwater"` and `passage = 1`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
## `summarise()` has grouped output by 'watertype'. You can override using the
## `.groups` argument.
# Merge with original data to keep only groups with significant regressions
df_filtered <- df_dis %>% inner_join(regression_results, by = c("watertype", "passage"))


# plot distance to two source communities
(p <- ggplot(df_dis, aes(x = fresh, y = sea)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "darkred", alpha = 0.5, size = 0.8) +  # y = x line
    geom_point(aes(fill = mix_ratio, shape = watertype), size = 2, stroke = 0.1, color = "black", shape = 21, alpha = 0.6) +
    # geom_smooth(data = df_filtered, aes(x = fresh, y = sea), method = "lm", formula = y ~ x, se = FALSE) +  # Regression line only for significant ones
    # stat_regline_equation(data = df_filtered, aes(label = paste(..eq.label.., sep = "~~~")), size = 3) +  # Regression equation
    # stat_cor(data = df_filtered, aes(label = paste("p =", p_value)), size = 3) +  # P-value
    facet_grid(watertype ~ passage) +
    labs(x = "Aitchison to freshwater source communities", y = "Aitchison to seawater source communities", title = "") +
    scale_fill_brewer(palette = "RdYlBu", guide=guide_legend(override.aes = list(shape=21)), name = "Mixing ratio \n(fresh:sea)") +
    xlim(8.1, 24) +  # Set x-axis limits
    ylim(8.1, 24) +  # Set y
    mytheme)
## Warning: Removed 38 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ggsave(paste0(folder_path, "distance_to_source_coms.png"), plot = p, width = 9, height = 3.3)


# Summarize mean and standard error for fresh and sea
df_summary <- df_fs %>%
  select(-c(mix, mix_simple)) %>% 
  group_by(watertype, mix_ratio, passage, source_com) %>%  # Group by relevant factors
  summarise(
    mean = mean(Freq, na.rm = TRUE),
    se = sd(Freq, na.rm = TRUE) / sqrt(n())) %>%
  ungroup() %>% 
  arrange(passage, mix_ratio) %>% 
  pivot_wider(names_from = source_com, values_from = c(mean, se), names_glue = "{.value}_{source_com}") %>% 
  mutate(mix_ratio = factor(mix_ratio, levels = c("0:100", "25:75", "50:50", "75:25", "100:0")))
## `summarise()` has grouped output by 'watertype', 'mix_ratio', 'passage'. You
## can override using the `.groups` argument.
summary(df_summary)
##   watertype         mix_ratio  passage   mean_fresh       mean_sea    
##  Length:60          0:100:12   1:10    Min.   :12.37   Min.   :10.06  
##  Class :character   25:75:12   2:10    1st Qu.:14.69   1st Qu.:15.19  
##  Mode  :character   50:50:12   3:10    Median :18.95   Median :18.65  
##                     75:25:12   4:10    Mean   :18.20   Mean   :17.91  
##                     100:0:12   5:10    3rd Qu.:20.73   3rd Qu.:20.47  
##                                6:10    Max.   :23.27   Max.   :22.50  
##     se_fresh          se_sea      
##  Min.   :0.1906   Min.   :0.1726  
##  1st Qu.:0.3079   1st Qu.:0.3049  
##  Median :0.3765   Median :0.3962  
##  Mean   :0.4394   Mean   :0.4323  
##  3rd Qu.:0.5132   3rd Qu.:0.5120  
##  Max.   :1.2787   Max.   :1.0076
# y = -x+a
a <- max(df_summary$mean_fresh) + min(df_summary$mean_sea)

min_xylim <- min(min(df_summary$mean_sea), min(df_summary$mean_fresh))
max_xylim <- max(max(df_summary$mean_sea), max(df_summary$mean_fresh))

# extract the color codes from the "RdYlBu" palette using the RColorBrewer
# display.brewer.pal(5, "RdYlBu")
# (colors <- brewer.pal(5, "RdYlBu"))
colors <- c("#D7191C", "#FDAE61", "gold2", "#ABD9E9", "#2C7BB6")

# Plot mean and standard error with connections
(p <- ggplot(df_summary, aes(x = mean_fresh, y = mean_sea, shape = passage)) +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "darkred", alpha = 0.2, size = 0.4) +  # y = x line
    # geom_abline(slope = -1, intercept = a, linetype = "dashed", color = "darkblue", alpha = 0.2, size = 0.4) +  # y = x line
    geom_errorbar(aes(ymin = mean_sea - se_sea, ymax = mean_sea + se_sea, color = mix_ratio), linewidth = 0.2, width = 0.1, alpha = 0.1, position = position_dodge(0.3)) +  # SE as error bars
    geom_errorbarh(aes(xmin = mean_fresh - se_fresh, xmax = mean_fresh + se_fresh, color = mix_ratio), linewidth = 0.2, height = 0.1, alpha = 0.1, position = position_dodge(0.3)) +  # Horizontal SE error bars
    geom_path(aes(group = mix_ratio, color = mix_ratio), linewidth = 0.2, alpha= 0.2) +
    # geom_point(aes(fill = mix_ratio, shape = watertype), size = 1, stroke = 0.1, alpha= 0.7, color = "black", shape = 21) +
    geom_text(aes(label = passage, color = mix_ratio), size = 3) +
    # facet_grid(watertype ~ mix_ratio) +
    facet_grid(. ~ watertype) +
    labs(x = "Aitchison to native freshwater communities", y = "Aitchison to native seawater communities", title = "") +
    scale_color_manual(values = colors, name = "Mixing ratio \n(fresh:sea)") +
    xlim(min_xylim, max_xylim) +  
    ylim(min_xylim, max_xylim) +  
    mytheme)
## Warning: `position_dodge()` requires non-overlapping x intervals.
## Warning: `position_dodge()` requires non-overlapping x intervals.
## `position_dodge()` requires non-overlapping x intervals.
## `position_dodge()` requires non-overlapping x intervals.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_errorbarh()`).

# ggsave(paste0(folder_path, "distance_to_source_coms_mean_se_with_lines_facet1.png"), plot = p, width = 5.5, height = 2.8)
# ggsave(paste0(folder_path, "distance_to_source_coms_mean_se_with_lines_nr_raw2.pdf"), plot = p, width = 5.5, height = 2.8)

Session Info

sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Monterey 12.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3 broom_1.0.7        ggpmisc_0.6.1      ggpp_0.5.8-1      
##  [5] circlize_0.4.16    phyloseq_1.50.0    ggpubr_0.6.0       vegan_2.6-8       
##  [9] lattice_0.22-6     permute_0.9-7      lubridate_1.9.4    forcats_1.0.0     
## [13] stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2        readr_2.1.5       
## [17] tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] polynom_1.4-1           rlang_1.1.4             magrittr_2.0.3         
##  [4] ade4_1.7-22             compiler_4.4.2          mgcv_1.9-1             
##  [7] vctrs_0.6.5             reshape2_1.4.4          quantreg_5.99.1        
## [10] pkgconfig_2.0.3         shape_1.4.6.1           crayon_1.5.3           
## [13] fastmap_1.2.0           backports_1.5.0         XVector_0.46.0         
## [16] labeling_0.4.3          utf8_1.2.4              rmarkdown_2.29         
## [19] tzdb_0.4.0              UCSC.utils_1.2.0        MatrixModels_0.5-3     
## [22] xfun_0.52               zlibbioc_1.52.0         cachem_1.1.0           
## [25] GenomeInfoDb_1.42.1     jsonlite_1.8.9          biomformat_1.34.0      
## [28] rhdf5filters_1.18.0     Rhdf5lib_1.28.0         parallel_4.4.2         
## [31] cluster_2.1.7           R6_2.5.1                bslib_0.8.0            
## [34] stringi_1.8.4           car_3.1-3               jquerylib_0.1.4        
## [37] Rcpp_1.0.13-1           iterators_1.0.14        knitr_1.49             
## [40] IRanges_2.40.1          Matrix_1.7-1            splines_4.4.2          
## [43] igraph_2.1.2            timechange_0.3.0        tidyselect_1.2.1       
## [46] rstudioapi_0.17.1       abind_1.4-8             yaml_2.3.10            
## [49] codetools_0.2-20        plyr_1.8.9              Biobase_2.66.0         
## [52] withr_3.0.2             evaluate_1.0.1          survival_3.7-0         
## [55] Biostrings_2.74.0       confintr_1.0.2          pillar_1.9.0           
## [58] carData_3.0-5           foreach_1.5.2           stats4_4.4.2           
## [61] generics_0.1.3          S4Vectors_0.44.0        hms_1.1.3              
## [64] munsell_0.5.1           scales_1.3.0            glue_1.8.0             
## [67] tools_4.4.2             data.table_1.16.4       SparseM_1.84-2         
## [70] ggsignif_0.6.4          rhdf5_2.50.0            grid_4.4.2             
## [73] ape_5.8                 colorspace_2.1-1        nlme_3.1-166           
## [76] GenomeInfoDbData_1.2.13 Formula_1.2-5           cli_3.6.3              
## [79] fansi_1.0.6             gtable_0.3.6            rstatix_0.7.2          
## [82] sass_0.4.9              digest_0.6.37           BiocGenerics_0.52.0    
## [85] farver_2.1.2            htmltools_0.5.8.1       multtest_2.62.0        
## [88] lifecycle_1.0.4         httr_1.4.7              GlobalOptions_0.1.2    
## [91] MASS_7.3-61